In [1]:
    
import pandas as pd
from sklearn.ensemble import RandomForestClassifier 
from sklearn.neighbors import KNeighborsClassifier
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
    
In [2]:
    
# Load the terrorism dataset from the Excel workbook in the working directory.
gtd = pd.read_excel(io='GTD.xlsx')
    
In [3]:
    
# Preview the first five rows.
gtd.head(n=5)
    
    Out[3]:
In [4]:
    
# Number of rows per region code.
gtd['region'].value_counts()
    
    Out[4]:
In [5]:
    
# Drop the leading "i" from the date-part column names.
date_column_names = dict(iyear='year', imonth='month', iday='day')
gtd = gtd.rename(columns=date_column_names)
    
In [6]:
    
# Remove rows whose month is recorded as 0.
gtd = gtd.loc[gtd['month'] != 0]
    
In [7]:
    
# Remove rows whose day is recorded as 0.
gtd = gtd.loc[gtd['day'] != 0]
    
In [8]:
    
# List the column labels after the rename and zero-date filtering above.
gtd.columns
    
    Out[8]:
In [9]:
    
# Count missing values in each column.
gtd.isnull().sum(axis=0)
    
    Out[9]:
In [10]:
    
# Number of rows per primary attack-type code.
gtd['attacktype1'].value_counts()
    
    Out[10]:
In [11]:
    
# Number of rows per city.
gtd['city'].value_counts()
    
    Out[11]:
In [12]:
    
# Number of rows per region name.
gtd['region_txt'].value_counts()
    
    Out[12]:
In [13]:
    
# 1. Assassination
# 2. Armed assault
# 3. Bombing/explosion
# 4. Hijacking
# 5. Hostage taking (barricade incident)
# 6. Hostage taking (kidnapping)
# 7. Facility/infrastructure attack
# 8. Unarmed assault
# 9. Unknown
    
In [14]:
    
# Middle East & North Africa subset.
# Take an explicit copy: later cells add columns to this frame, and assigning
# into a boolean-mask slice of `gtd` triggers pandas' SettingWithCopyWarning.
me_attks = gtd[gtd.region_txt=='Middle East & North Africa'].copy()
    
In [15]:
    
# Histogram of attack-type codes for the MENA subset.
ax = sns.distplot(me_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in the MENA Region')
plt.show()
    
    
In [16]:
    
# South America subset.
s_am_attks = gtd.loc[gtd['region_txt'] == 'South America']
    
In [17]:
    
# Histogram of attack-type codes for South America.
ax = sns.distplot(s_am_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in South America')
plt.show()
    
    
In [18]:
    
# South Asia: subset the rows and plot attack-type frequencies.
sa_attks = gtd.loc[gtd['region_txt'] == 'South Asia']
ax = sns.distplot(sa_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in South Asia')
plt.show()
    
    
In [19]:
    
# Western Europe: subset the rows and plot attack-type frequencies.
we_attks = gtd.loc[gtd['region_txt'] == 'Western Europe']
ax = sns.distplot(we_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Western Europe')
plt.show()
    
    
In [20]:
    
# Sub-Saharan Africa subset.
# Take an explicit copy: later cells add indicator columns to this frame, and
# assigning into a boolean-mask slice of `gtd` triggers SettingWithCopyWarning.
sub_af_attks = gtd[gtd.region_txt=='Sub-Saharan Africa'].copy()
ax = sns.distplot(sub_af_attks.attacktype1, kde=False)
ax.set(xlabel='Attack Type', ylabel='Frequency in Sub Saharan Africa')
plt.show()
# Only one or two regions show more armed assaults than bombings.
    
    
In [21]:
    
# Southeast Asia: subset the rows and plot attack-type frequencies.
sea_attks = gtd.loc[gtd['region_txt'] == 'Southeast Asia']
ax = sns.distplot(sea_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Southeast Asia')
plt.show()
    
    
In [22]:
    
# Central America & Caribbean: subset the rows and plot attack-type frequencies.
cac_attks = gtd.loc[gtd['region_txt'] == 'Central America & Caribbean']
ax = sns.distplot(cac_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Central America and the Caribbean')
plt.show()
    
    
In [23]:
    
# Eastern Europe: subset the rows and plot attack-type frequencies.
ee_attks = gtd.loc[gtd['region_txt'] == 'Eastern Europe']
ax = sns.distplot(ee_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Eastern Europe')
plt.show()
    
    
In [24]:
    
# North America: subset the rows and plot attack-type frequencies.
na_attks = gtd.loc[gtd['region_txt'] == 'North America']
ax = sns.distplot(na_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in North America')
plt.show()
    
    
In [25]:
    
# East Asia: subset the rows and plot attack-type frequencies.
ea_attks = gtd.loc[gtd['region_txt'] == 'East Asia']
ax = sns.distplot(ea_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in East Asia')
plt.show()
    
    
In [26]:
    
# Central Asia: subset the rows and plot attack-type frequencies.
ca_attks = gtd.loc[gtd['region_txt'] == 'Central Asia']
ax = sns.distplot(ca_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Central Asia')
plt.show()
    
    
In [27]:
    
# Australasia & Oceania: subset the rows and plot attack-type frequencies
# (this plot alone pins the histogram to nine bins).
ao_attks = gtd.loc[gtd['region_txt'] == 'Australasia & Oceania']
ax = sns.distplot(ao_attks['attacktype1'], kde=False, bins=9)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Australasia & Oceania')
plt.show()
    
    
In [28]:
    
# Bombings are the most frequent attack type in every region except Sub-Saharan
# Africa and Central America/the Caribbean, where armed assault is relatively more frequent.
# In North America, Australasia/Oceania, and Western Europe, facility/infrastructure attacks
# are more frequent than in other areas. This makes sense, as these regions are generally
# more developed than others in terms of urban infrastructure.
    
In [29]:
    
# Number of MENA rows per attack-type code (counting non-null `year` values).
me_attks.groupby('attacktype1')['year'].count()
    
    Out[29]:
In [30]:
    
# Add 0/1 indicator columns for armed assaults (attack type 2) and bombings
# (attack type 3).
# A vectorised comparison replaces the row-wise `apply`, and `assign` returns a
# new frame, so nothing is written into a slice of `gtd`
# (no SettingWithCopyWarning).
me_attks = me_attks.assign(
    armed_assault=(me_attks['attacktype1'] == 2).astype(int),
    bombings=(me_attks['attacktype1'] == 3).astype(int),
)
    
    
In [31]:
    
# Preview the MENA rows.
me_attks.head(n=5)
    
    Out[31]:
In [32]:
    
# Yearly totals for the MENA subset.
# Only numeric columns are summed: summing the free-text columns is meaningless
# and fails on mixed string/NaN columns in current pandas releases.
me_attks_grouped_by_year = (
    me_attks.select_dtypes(include=['number'])
    .groupby('year')
    .sum()
)
    
In [33]:
    
# Move `year` from the index back into an ordinary column.
me_attks_grouped_by_year = me_attks_grouped_by_year.reset_index(drop=False)
    
In [34]:
    
# Preview the yearly MENA totals.
me_attks_grouped_by_year.head(n=5)
    
    Out[34]:
In [35]:
    
# Years with few (<= 174) and many (>= 269) bombings in MENA.
# NOTE(review): the thresholds are hard-coded; presumably the 25%/75% rows of a
# describe() output - confirm, or derive them with .quantile().
yearly_bombings = me_attks_grouped_by_year['bombings']
me_bombs_1st_q = me_attks_grouped_by_year[yearly_bombings <= 174]
me_bombs_4th_q = me_attks_grouped_by_year[yearly_bombings >= 269]
    
In [36]:
    
# Summary statistics of yearly deaths in the low-bombing years.
me_bombs_1st_q['nkill'].describe()
    
    Out[36]:
In [37]:
    
# Summary statistics of bombings per year in MENA.
me_attks_grouped_by_year['bombings'].describe()
    
    Out[37]:
In [38]:
    
# Years with few (<= 35) and many (>= 204) armed assaults in MENA.
# NOTE(review): the thresholds are hard-coded; presumably the 25%/75% rows of a
# describe() output - confirm, or derive them with .quantile().
yearly_armed_assaults = me_attks_grouped_by_year['armed_assault']
me_aa_1st_q = me_attks_grouped_by_year[yearly_armed_assaults <= 35]
me_aa_4th_q = me_attks_grouped_by_year[yearly_armed_assaults >= 204]
    
In [39]:
    
# Summary statistics of armed assaults per year in MENA.
me_attks_grouped_by_year['armed_assault'].describe()
    
    Out[39]:
In [40]:
    
# Switch matplotlib to the 'fivethirtyeight' style sheet for the figures drawn after this cell.
plt.style.use('fivethirtyeight')
    
In [41]:
    
# Overlay the distributions of yearly bombing counts for the low- and
# high-bombing year subsets on one axis.
ax = sns.distplot(me_bombs_1st_q.bombings)
sns.distplot(me_bombs_4th_q.bombings)
# Fixed the "Distribtuons" typo in the axis label.
ax.set(xlabel='Distributions of Bombings in Each Quartile')
plt.show()
    
    
In [42]:
    
# Deaths versus bombings per year, restricted to the high-bombing years.
ax = sns.regplot(x='bombings', y='nkill', data=me_bombs_4th_q)
ax.set_xlabel('Fourth Quartile Bombings in MENA')
ax.set_ylabel('Number Killed')
plt.show()
    
    
In [43]:
    
# Overlay the distributions of yearly armed-assault counts for the low- and
# high-assault year subsets on one axis.
ax = sns.distplot(me_aa_1st_q.armed_assault)
sns.distplot(me_aa_4th_q.armed_assault)
# The label previously said "Bombings" (and misspelled "Distributions") although
# both plotted series are armed-assault counts.
ax.set(xlabel='Distributions of Armed Assaults in Each Quartile in MENA')
plt.show()
    
    
In [44]:
    
# Deaths versus armed assaults per year, restricted to the low-assault years.
ax = sns.regplot(x='armed_assault', y='nkill', data=me_aa_1st_q)
ax.set_xlabel('First Quartile Armed Assaults in MENA')
ax.set_ylabel('Number Killed')
plt.show()
    
    
In [45]:
    
# Deaths versus armed assaults per year, restricted to the high-assault years.
ax = sns.regplot(x='armed_assault', y='nkill', data=me_aa_4th_q)
ax.set_xlabel('Fourth Quartile Armed Assaults in MENA')
ax.set_ylabel('Number Killed')
plt.show()
    
    
In [48]:
    
# `sub_af_aa_4th_q` is not assigned anywhere in the visible notebook and the
# yearly Sub-Saharan aggregate is only built in a later cell, so this cell
# failed on a fresh kernel. Build both here from `sub_af_attks`.
_sub_af_flagged = sub_af_attks.assign(
    armed_assault=(sub_af_attks['attacktype1'] == 2).astype(int),
    bombings=(sub_af_attks['attacktype1'] == 3).astype(int),
)
sub_af_attks_grouped_by_year = (
    _sub_af_flagged.select_dtypes(include=['number'])
    .groupby('year')
    .sum()
    .reset_index()
)
# NOTE(review): the original cut-off is unknown; the 75th percentile is assumed
# to match the "fourth quartile" wording of the axis label - confirm.
_aa_per_year = sub_af_attks_grouped_by_year['armed_assault']
sub_af_aa_4th_q = sub_af_attks_grouped_by_year[_aa_per_year >= _aa_per_year.quantile(0.75)]

# Deaths versus armed assaults per year in the high-assault years.
ax = sns.regplot(x=sub_af_aa_4th_q.armed_assault, y=sub_af_aa_4th_q.nkill, data=sub_af_aa_4th_q)
ax.set(xlabel='Fourth Quartile Armed Assaults in Sub Saharan Africa', ylabel='Number Killed')
plt.show()
    
    
In [49]:
    
# Summary statistics of armed assaults per year in Sub-Saharan Africa.
# NOTE(review): relies on sub_af_attks_grouped_by_year already existing in the
# kernel when this cell runs.
sub_af_attks_grouped_by_year['armed_assault'].describe()
    
    
In [50]:
    
# List the columns available in the low-bombing yearly subset.
me_bombs_1st_q.columns
    
    Out[50]:
In [51]:
    
# Armed assaults over time, MENA (the Sub-Saharan Africa counterpart follows in a later cell).
ax = sns.regplot(x='year', y='armed_assault', data=me_attks_grouped_by_year)
ax.set_xlabel('Year')
ax.set_ylabel('Attacks in MENA Region')
plt.title('Armed Assaults in the MENA Region')
plt.show()
    
    
In [52]:
    
# Add 0/1 indicators for armed assaults (attack type 2) and bombings (attack
# type 3), then total everything per year.
# The comparison is vectorised instead of a row-wise `apply`, and `assign`
# returns a new frame so no column is written into a slice of `gtd`.
sub_af_attks = sub_af_attks.assign(
    armed_assault=(sub_af_attks['attacktype1'] == 2).astype(int),
    bombings=(sub_af_attks['attacktype1'] == 3).astype(int),
)
# Sum numeric columns only: summing free-text columns is meaningless and fails
# on mixed string/NaN columns in current pandas releases.
sub_af_attks_grouped_by_year = (
    sub_af_attks.select_dtypes(include=['number'])
    .groupby('year')
    .sum()
    .reset_index()
)
    
    
In [53]:
    
# Armed assaults over time, Sub-Saharan Africa.
ax = sns.regplot(x='year', y='armed_assault', data=sub_af_attks_grouped_by_year)
ax.set_xlabel('Year')
ax.set_ylabel('Armed Assaults per year')
plt.title('Armed Assaults in Sub-Saharan Africa')
plt.show()
    
    
In [54]:
    
# Bombings over time, MENA (the Sub-Saharan Africa counterpart follows in the next cell).
ax = sns.regplot(x='year', y='bombings', data=me_attks_grouped_by_year)
ax.set_xlabel('Year')
ax.set_ylabel('Bombings per year')
plt.title('Bombings Each Year in the MENA Region')
plt.show()
    
    
In [55]:
    
# Bombings over time, Sub-Saharan Africa.
ax = sns.regplot(x='year', y='bombings', data=sub_af_attks_grouped_by_year)
ax.set_xlabel('Year')
ax.set_ylabel('Bombings per year')
plt.title('Bombings Each Year in Sub-Saharan Africa')
plt.show()
    
    
In [56]:
    
# Number killed per year, MENA.
ax = sns.regplot(x='year', y='nkill', data=me_attks_grouped_by_year)
ax.set_xlabel('1970 to 2015')
ax.set_ylabel('Number Killed Each Year in the MENA Region')
plt.show()
    
    
In [57]:
    
# Number killed per year, Sub-Saharan Africa.
ax = sns.regplot(x='year', y='nkill', data=sub_af_attks_grouped_by_year)
ax.set_xlabel('1970 to 2015')
ax.set_ylabel('Number Killed Each Year in Sub Saharan Africa')
plt.show()
    
    
In [58]:
    
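# Disabled draft: build a single datetime column from year/month/day.
# NOTE: `gtd_ts` is not defined in the visible notebook; `gtd.month` was probably intended.
#gtd['date'] = pd.to_datetime((gtd.year*10000+gtd_ts.month*100+gtd.day).apply(str),format='%Y%m%d')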
    
In [59]:
    
# Number of rows per country.
gtd['country_txt'].value_counts()
    
    Out[59]:
In [ ]:
    
#Let's compare the top five in Sub-Saharan Africa and top five in the Middle East.
    
In [60]:
    
# Five countries with the most rows in the Sub-Saharan Africa subset.
sub_af_attks['country_txt'].value_counts().head(n=5)
    
    Out[60]:
In [61]:
    
# Five countries with the most rows in the MENA subset.
me_attks['country_txt'].value_counts().head(n=5)
    
    Out[61]:
In [62]:
    
# Nigeria: subset the rows and plot attack-type frequencies.
nigeria_attks = sub_af_attks.loc[sub_af_attks['country_txt'] == 'Nigeria']
ax = sns.distplot(nigeria_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Nigeria')
plt.show()
    
    
In [63]:
    
# Somalia: subset the rows and plot attack-type frequencies.
somalia_attks = sub_af_attks.loc[sub_af_attks['country_txt'] == 'Somalia']
ax = sns.distplot(somalia_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Somalia')
plt.show()
# Author's observation: bombings are much more frequent in Somalia than in Nigeria.
    
    
In [64]:
    
# South Africa: subset the rows and plot attack-type frequencies.
s_af_attks = sub_af_attks.loc[sub_af_attks['country_txt'] == 'South Africa']
ax = sns.distplot(s_af_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in South Africa')
plt.show()
    
    
In [65]:
    
# Sudan: subset the rows and plot attack-type frequencies.
sudan_attks = sub_af_attks.loc[sub_af_attks['country_txt'] == 'Sudan']
ax = sns.distplot(sudan_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Sudan')
plt.show()
    
    
In [66]:
    
# Kenya: subset the rows and plot attack-type frequencies.
kenya_attks = sub_af_attks.loc[sub_af_attks['country_txt'] == 'Kenya']
ax = sns.distplot(kenya_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Kenya')
plt.show()
# Author's observation: bombings and armed assaults are roughly equally frequent.
    
    
In [67]:
    
# Iraq: subset the MENA rows and plot attack-type frequencies.
iraq_attks = me_attks[me_attks.country_txt=='Iraq']
# Plot the Iraq subset; the original passed the whole MENA frame, so the figure
# labelled "Iraq" actually showed the regional distribution.
ax = sns.distplot(iraq_attks.attacktype1, kde=False)
ax.set(xlabel='Attack Type', ylabel='Frequency in Iraq')
plt.show()
    
    
In [68]:
    
# Turkey: subset the MENA rows and plot attack-type frequencies.
turkey_attks = me_attks.loc[me_attks['country_txt'] == 'Turkey']
ax = sns.distplot(turkey_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Turkey')
plt.show()
    
    
In [69]:
    
# Algeria: subset the MENA rows and plot attack-type frequencies.
algeria_attks = me_attks.loc[me_attks['country_txt'] == 'Algeria']
ax = sns.distplot(algeria_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Algeria')
plt.show()
    
    
In [70]:
    
# Yemen: subset the MENA rows and plot attack-type frequencies.
yemen_attks = me_attks.loc[me_attks['country_txt'] == 'Yemen']
ax = sns.distplot(yemen_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Yemen')
plt.show()
    
    
In [71]:
    
# Lebanon: subset the MENA rows and plot attack-type frequencies.
leb_attks = me_attks.loc[me_attks['country_txt'] == 'Lebanon']
ax = sns.distplot(leb_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Lebanon')
plt.show()
    
    
In [ ]:
    
    
In [72]:
    
# compare two populations. 
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('fivethirtyeight')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import pymc3 as pm
    
In [73]:
    
# (Re)compute the 0/1 armed-assault indicator (attack type 2).
# Vectorised instead of a row-wise `apply`; `assign` returns a new frame, so no
# column is written into a slice of `gtd`.
sub_af_attks = sub_af_attks.assign(
    armed_assault=(sub_af_attks['attacktype1'] == 2).astype(int)
)
    
    
In [74]:
    
# Sub-Saharan Africa rows from 2002 onward (first step of the Somalia/Nigeria comparison;
# the next cell repeats this filter and adds the upper bound).
sub_af_00s = sub_af_attks.loc[sub_af_attks['year'] > 2001]
    
In [75]:
    
# Window for the Somalia/Nigeria comparison: years 2002 through 2010.
after_2001 = sub_af_attks['year'] > 2001
before_2011 = sub_af_attks['year'] < 2011
sub_af_00s = sub_af_attks[after_2001 & before_2011]
    
In [76]:
    
# Split the 2002-2010 window into the two countries being compared.
nig_00s = sub_af_00s.loc[sub_af_00s['country_txt'] == 'Nigeria']
som_00s = sub_af_00s.loc[sub_af_00s['country_txt'] == 'Somalia']
    
In [77]:
    
# Display the Nigeria rows for the 2002-2010 window.
# NOTE(review): this renders the whole frame; consider .head() to keep the output short.
nig_00s
    
    Out[77]:
In [78]:
    
# List the column labels; the next cells pick columns by integer position.
nig_00s.columns
    
    Out[78]:
In [79]:
    
# Keep a subset of columns chosen by integer position.
# `df[[ints]]` is a label lookup, which raises KeyError on string-labelled
# columns in current pandas; `.iloc` selects by position explicitly.
# NOTE(review): the positions are magic numbers - selecting by column name would be safer.
nig_00s = nig_00s.iloc[:, [0,1,2,4,8,12,13,17,18,19,66]]
    
In [80]:
    
# Keep a subset of columns chosen by integer position.
# `df[[ints]]` is a label lookup, which raises KeyError on string-labelled
# columns in current pandas; `.iloc` selects by position explicitly.
# NOTE(review): the positions are magic numbers - selecting by column name would be safer.
som_00s = som_00s.iloc[:, [0,1,2,4,8,12,13,17,18,19,66]]
    
In [81]:
    
# Prior hyper-parameters: mean and standard deviation of the armed-assault
# indicator over the whole 2002-2010 Sub-Saharan window.
armed_assault_00s = sub_af_00s['armed_assault']
mean_prior_mean = armed_assault_00s.mean()
mean_prior_std = armed_assault_00s.std()
    
In [82]:
    
# Create the model and give both country means the same Normal prior.
with pm.Model() as model:
    nig_mean = pm.Normal('Nigeria Armed Assaults Mean', mu=mean_prior_mean, sd=mean_prior_std)
    som_mean = pm.Normal('Somalia Armed Assaults Mean', mu=mean_prior_mean, sd=mean_prior_std)
    
In [83]:
    
# Uniform priors on each country's standard deviation, sharing the same bounds.
std_prior_lower, std_prior_upper = 0.01, 100.0
with model:
    std_bounds = dict(lower=std_prior_lower, upper=std_prior_upper)
    nig_std = pm.Uniform('Nigeria Armed Assaults Std', **std_bounds)
    som_std = pm.Uniform('Somalia Armed Assaults Std', **std_bounds)
    
In [84]:
    
# Likelihoods for the two countries.
# Observe only the 0/1 armed-assault indicator - the quantity the priors were
# derived from. The original passed the whole `nig_00s` / `som_00s` frames
# (eleven unrelated columns) as the observed data.
# The indicator is taken from `sub_af_00s` because the positional column
# selection applied to `nig_00s` / `som_00s` may have dropped it.
_nig_armed_assault = sub_af_00s.loc[sub_af_00s['country_txt'] == 'Nigeria', 'armed_assault'].values
_som_armed_assault = sub_af_00s.loc[sub_af_00s['country_txt'] == 'Somalia', 'armed_assault'].values
with model:
    nigeria_group = pm.Normal('Nigeria Armed Assaults', mu=nig_mean, sd=nig_std, observed=_nig_armed_assault)
    som_group = pm.Normal('Somalia Armed Assaults', mu=som_mean, sd=som_std, observed=_som_armed_assault)
    
In [85]:
    
# Derived quantities tracked during sampling: the differences between the two
# countries and a standardised effect size.
with model:
    diff_of_means = pm.Deterministic('difference of means', nig_mean - som_mean)
    diff_of_stds = pm.Deterministic('difference of stds', nig_std - som_std)
    # Root-mean-square of the two standard deviations.
    pooled_std = np.sqrt((nig_std**2 + som_std**2) / 2)
    effect_size = pm.Deterministic('effect size', diff_of_means / pooled_std)
    
In [103]:
    
# Draw 10,000 posterior samples; the plotting/summary cells below slice off the first 3,000.
# NOTE(review): no random seed is passed, so the draws are not reproducible from
# run to run - consider a fixed seed (confirm the keyword for the installed PyMC3).
with model:
    trace = pm.sample(10000, njobs=-1)
    
    
In [104]:
    
# Posterior plots of the per-country means and standard deviations,
# skipping the first 3,000 draws.
posterior_param_names = [
    'Nigeria Armed Assaults Mean',
    'Somalia Armed Assaults Mean',
    'Nigeria Armed Assaults Std',
    'Somalia Armed Assaults Std',
]
pm.plot_posterior(trace[3000:], varnames=posterior_param_names, color='#87ceeb')
    
    Out[104]:
    
In [105]:
    
# Posterior plots of the comparison quantities against a reference value of 0,
# skipping the first 3,000 draws.
comparison_names = ['difference of means', 'difference of stds', 'effect size']
pm.plot_posterior(trace[3000:], varnames=comparison_names, ref_val=0, color='#87ceeb')
    
    Out[105]:
    
In [107]:
    
# Summary of the comparison quantities, skipping the first 3,000 draws.
summary_names = ['difference of means', 'difference of stds', 'effect size']
pm.summary(trace[3000:], varnames=summary_names)
    
    
In [108]:
    
#Drop the 73 null values for lat and long.
    
In [109]:
    
# 1993 is the year being estimated, so take the neighbouring years
# (1992 and 1994) from each region as reference data.
me_attks_b4, me_attks_aftr = (
    me_attks[me_attks['year'] == yr] for yr in (1992, 1994)
)
sub_af_attks_b4, sub_af_attks_aftr = (
    sub_af_attks[sub_af_attks['year'] == yr] for yr in (1992, 1994)
)
    
In [110]:
    
# Preview the 1992 Sub-Saharan rows.
sub_af_attks_b4.head(n=5)
    
    Out[110]:
In [111]:
    
# Preview the 1994 Sub-Saharan rows.
sub_af_attks_aftr.head(n=5)
    
    Out[111]:
In [112]:
    
# Bombings per month in 1992.
# `bombings` is a 0/1 flag, so it must be summed; `.count()` returned the number
# of all attacks in each month regardless of type.
bombs_per_month_1 = sub_af_attks_b4.groupby('month').bombings.sum()
    
In [113]:
    
# Bombings per month in 1994.
# `bombings` is a 0/1 flag, so it must be summed; `.count()` returned the number
# of all attacks in each month regardless of type.
bombs_per_month_2 = sub_af_attks_aftr.groupby('month').bombings.sum()
    
In [114]:
    
# Estimate 1993 as the month-by-month average of 1992 and 1994.
# Align both series on calendar months 1..12 first (a month with no rows counts
# as 0); stacking the raw series paired values by position and broke when the
# two years did not cover the same months.
_calendar_months = list(range(1, 13))
bombs_per_month_93 = np.mean(
    np.array([
        bombs_per_month_1.reindex(_calendar_months, fill_value=0),
        bombs_per_month_2.reindex(_calendar_months, fill_value=0),
    ]),
    axis=0,
)
    
In [115]:
    
# Wrap the averaged monthly values in a DataFrame for display.
bombs_per_month_93 = pd.DataFrame(data=bombs_per_month_93)
    
In [116]:
    
# Show the averaged 1992/1994 monthly values used as the 1993 estimate.
bombs_per_month_93
    
    Out[116]:
In [117]:
    
# Bombings per country in 1992.
# `bombings` is a 0/1 flag, so it must be summed; `.count()` returned the number
# of all attacks in each country regardless of type.
bombs_per_country_1 = sub_af_attks_b4.groupby('country_txt').bombings.sum()
    
In [118]:
    
# Bombings per country in 1994.
# `bombings` is a 0/1 flag, so it must be summed; `.count()` returned the number
# of all attacks in each country regardless of type.
bombs_per_country_2 = sub_af_attks_aftr.groupby('country_txt').bombings.sum()
    
In [119]:
    
# Convert the 1992 per-country values to a DataFrame.
bombs_per_country_1 = pd.DataFrame(data=bombs_per_country_1)
    
In [120]:
    
# Convert the 1994 per-country values to a DataFrame.
bombs_per_country_2 = pd.DataFrame(data=bombs_per_country_2)
    
In [121]:
    
# Stack the 1992 and 1994 per-country frames, then average the rows that share
# a country label.
bombs_per_county_93 = pd.concat([bombs_per_country_1, bombs_per_country_2])
country_labels = bombs_per_county_93.index
bombs_per_county_93.groupby(country_labels).mean()
    
    Out[121]:
In [ ]: